# Standalone script: extract the target URLs from a previously downloaded HTML document.
import re
import sys
import json
import time
import os

# Expect exactly two arguments: the HTML file to scan and the config key
# (application name) whose "url_pattern" should be used.
if len(sys.argv) != 3:
    print("[INFO] Usage: python extract_urls.py Html_files/JD文学小说.html JD")
    # sys.exit() instead of the site-provided exit(): the latter is an
    # interactive helper that is missing under `python -S` / frozen builds.
    # Exit status is intentionally left unchanged.
    sys.exit()

html_file = sys.argv[1]
app_name = sys.argv[2]

# config.json is read from the current working directory; it must contain
# an entry config[app_name]["url_pattern"] holding a regular expression.
with open("config.json", encoding="utf-8") as fd:
    config = json.load(fd)

# Read the whole document inside a context manager so the handle is always
# released (the file was previously left open).
with open(html_file, encoding="utf-8") as html_f:
    html_data = html_f.read()

pat = re.compile(config[app_name]["url_pattern"])

# Each match has its first 6 characters and its last character stripped;
# the set removes duplicates.
# NOTE(review): the [6:-1] slice presumably removes a `href="` prefix and the
# closing quote — confirm against the patterns stored in config.json.
url_set = {match[6:-1] for match in pat.findall(html_data)}

ts = time.strftime("%m-%d_%H%M%S", time.localtime())
output_path = "Url_dir/{}_{}.txt".format(os.path.basename(html_file), ts)

# Create the output directory on first use instead of failing with
# FileNotFoundError.
os.makedirs("Url_dir", exist_ok=True)

# One URL per line. Sorting makes the output reproducible between runs (set
# iteration order is not stable); the context manager guarantees the file is
# flushed and closed even if a write fails.
with open(output_path, 'w', encoding="utf-8") as output_url:
    output_url.writelines(item + "\n" for item in sorted(url_set))
